Module Preservation | Approach: MF vs Mostafavi
Pairwise comparisons
# Input: expression matrix and modules
########## Dataset 01
dir_expr_01 = "/pastel/projects/speakeasy_dlpfc/"
dir_mod_01 = "/pastel/projects/speakeasy_dlpfc/SpeakEasy_net_MF/"
# Residuals of the expression:
load(paste0(dir_expr_01, "exprdata_byregion.Rdata"))
res_dataset_01 = exprData_MF # Residuals of the expression
res_dataset_01 = as.data.frame(t(res_dataset_01))
# Remove ENS version
colnames(res_dataset_01) = gsub("(.*)\\.(.*)", "\\1", colnames(res_dataset_01))
rownames(res_dataset_01) = gsub("(.*)_(.*)", "\\2", rownames(res_dataset_01))
# clusters from SpeakEasy:
k_dataset_01 = read.table(paste0(dir_mod_01, "geneBycluster.txt"), header = T, stringsAsFactors = F)
k_dataset_01$ensembl = gsub("(.*)\\.(.*)", "\\1", k_dataset_01$ensembl)
# Checking order
# all(k_dataset_01$ensembl == colnames(res_dataset_01))
modules_dataset_01 = setNames(k_dataset_01$cluster_lv3, k_dataset_01$ensembl)
# all(names(modules_dataset_01) == colnames(res_dataset_01))
# dim(res_dataset_01) # 1210 17309
# length(modules_dataset_01) # 17309
## Dataset 1 is ready!
########## Dataset 02
dir_expr_02 = "/pastel/projects/speakeasy_dlpfc/mostafavi/"
dir_mod_02 = "/pastel/projects/speakeasy_dlpfc/mostafavi/"
dat = read_rds(paste0(dir_expr_02, "DLPFC_gene_sara_v1.rds"))
exp_mat = assay(dat) %>% as.data.frame()
sample_meta = colData(dat) %>% as.data.frame()
gene_meta = elementMetadata(dat) %>% as.data.frame()
colnames(exp_mat) = gsub("(.*)\\:(.*)","\\1",colnames(exp_mat)) # Fix projIds
# Match expression with genes with modules
k_dataset_02 = read.table(paste0(dir_mod_02, "mRNA_annotation.txt"), header = F, stringsAsFactors = F)
colnames(k_dataset_02) = c("symbol", "cluster", "ensembl")
# length(k_dataset_02$symbol[k_dataset_02$cluster == 109]) # 390 genes
# dim(k_dataset_02)
# dim(exp_mat)
# Rownames in exp_mat has ensembl ID version
# sum(duplicated(gsub("(.*)\\.(.*)","\\1",rownames(exp_mat)))) # No duplicates
rownames(exp_mat) = gsub("(.*)\\.(.*)","\\1",rownames(exp_mat))
genes_in_common_testset = intersect(rownames(exp_mat),k_dataset_02$ensembl)
# length(genes_in_common_testset)
k_dataset_02 = k_dataset_02[match(genes_in_common_testset,k_dataset_02$ensembl),]
exp_mat = exp_mat[match(genes_in_common_testset,rownames(exp_mat)),]
# all(rownames(exp_mat)==k_dataset_02$ensembl)
# dim(k_dataset_02)
# Mostafavi's matrix has ensembl. We need to match both
res_dataset_02 = na.omit(as.data.frame(t(data.matrix(exp_mat))))
colnames(res_dataset_02) = k_dataset_02[match(colnames(res_dataset_02),k_dataset_02$ensembl),"ensembl"] # it can be symbol as in the ST data
# all(k_dataset_02$ensembl == colnames(res_dataset_02))
modules_dataset_02 = setNames(k_dataset_02$cluster, k_dataset_02$ensembl)
# all(names(modules_dataset_02) == colnames(res_dataset_02))
# dim(res_dataset_02) # 508 13412
# length(modules_dataset_02) # 13412
# Mostafavi's data is ready!### Now data is ready to run
## Input:
# res_dataset_01: expression matrix from dataset 01 (rows are samples, columns are genes)
# modules_dataset_01: modules from dataset 01 (named array with module labels matching columns from res_dataset_01)
# prefix_dataset_01: Some ID to add to module lables (e.g. MF_)
# res_dataset_02: expression matrix from dataset 02 (rows are samples, columns are genes)
# modules_dataset_02: modules from dataset 02 (named array with module labels matching columns from res_dataset_02)
# prefix_dataset_02: Some ID to add to module lables (e.g. MF_)
prefix_dataset_01 = "MF_"
prefix_dataset_02 = "Sara_M"
# Add module prefix
modules_dataset_01 = paste0(prefix_dataset_01, modules_dataset_01)
modules_dataset_02 = paste0(prefix_dataset_02, modules_dataset_02)
setLabels = c("dataset_01", "dataset_02")
multiExpr = list(dataset_01 = list(data = res_dataset_01), dataset_02 = list(data = res_dataset_02))
multiColor = list(dataset_01 = modules_dataset_01, dataset_02 = modules_dataset_02)
# Here comes the calculation of module preservation, it takes a while (will test both directions)
enableWGCNAThreads(nThreads = 8)
system.time( {
mp = modulePreservation(multiExpr, multiColor, dataIsExpr = T,
maxModuleSize = 20000,
referenceNetworks = c(1,2), # Will first test set 1 as reference, then set 2 as reference.
nPermutations = 200, # Default = 200
randomSeed = 2023,
quickCor = 1, # 0 is suppose to be more precise but we get high Zsummary like 200 instead of 20
verbose = 3,
parallelCalculation = T)
} )
####################
ref = 1 # MF
test = 2 # Mostafavi
statsObs_12 = cbind(Refence = "MF", Test = "Mostafavi", mp$quality$observed[[ref]][[test]][,-1], mp$preservation$observed[[ref]][[test]][,-1])
statsObs_12$module_id = rownames(statsObs_12)
statsZ_12 = cbind(mp$quality$Z[[ref]][[test]], mp$preservation$Z[[ref]][[test]][,-1])
statsZ_12$module_id = rownames(statsZ_12)
res_df_12 = statsObs_12 %>% dplyr::left_join(statsZ_12, by = c("module_id"))
####################
ref = 2 # Mostafavi
test = 1 # MF
statsObs_21 = cbind(Refence = "Mostafavi", Test = "MF", mp$quality$observed[[ref]][[test]][,-1], mp$preservation$observed[[ref]][[test]][,-1])
statsObs_21$module_id = rownames(statsObs_21)
statsZ_21 = cbind(mp$quality$Z[[ref]][[test]], mp$preservation$Z[[ref]][[test]][,-1])
statsZ_21$module_id = rownames(statsZ_21)
res_df_21 = statsObs_21 %>% dplyr::left_join(statsZ_21, by = c("module_id"))
####################
res_df = bind_rows(res_df_12, res_df_21)
# Save the results
save(mp,res_df,res_df_12,res_df_21, file = paste0(work_dir, "mp_MF_mostafavi.RData"))load(paste0(work_dir, "mp_MF_mostafavi.RData"))
createDT(res_df)Question: Are the modules from MF preserved in Mostafavi network?
Reference: Dataset 01 (MF) Test: Dataset 02 (Mostafavi)
# Compare preservation to qualit (generates a table)
p_medianrank <- ggplot(res_df_12, aes(x = moduleSize, y = medianRank.pres )) +
geom_point(shape = 21) +
geom_label_repel(aes(label = module_id)) +
scale_y_reverse() +
labs(y = "Preservation Median rank", x = "Module size", title = "Preservation Median rank") +
theme_classic()
p_zsummary <- ggplot(res_df_12, aes(x = moduleSize, y = Zsummary.pres )) +
geom_hline(yintercept = c(2,10), linetype = "dashed") +
geom_point(shape = 21) +
geom_label_repel(aes(label = module_id)) +
labs(y = "Preservation Zsummary", x = "Module size", title = "Preservation Zsummary") +
theme_classic()
# Plot the REFERENCE
ggarrange(p_medianrank, p_zsummary, ncol=2)Question: Are the modules from Mostafavi preserved at the MF network?
Reference: Dataset 02 Test: Dataset 01
# Compare preservation to qualit (generates a table)
p_medianrank <- ggplot(res_df_21, aes(x = moduleSize, y = medianRank.pres )) +
geom_point(shape = 21) +
geom_label_repel(aes(label = module_id)) +
scale_y_reverse() +
labs(y = "Preservation Median rank", x = "Module size", title = "Preservation Median rank") +
theme_classic()
p_zsummary <- ggplot(res_df_21, aes(x = moduleSize, y = Zsummary.pres )) +
geom_hline(yintercept = c(2,10), linetype = "dashed") +
geom_point(shape = 21) +
geom_label_repel(aes(label = module_id), max.overlaps = 20) +
labs(y = "Preservation Zsummary", x = "Module size", title = "Preservation Zsummary") +
theme_classic()
# Plot the REFERENCE
ggarrange(p_medianrank, p_zsummary, ncol=2)sessionInfo()## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Stream 8
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] SummarizedExperiment_1.24.0 Biobase_2.54.0
## [3] GenomicRanges_1.46.1 GenomeInfoDb_1.30.1
## [5] IRanges_2.28.0 S4Vectors_0.32.3
## [7] BiocGenerics_0.40.0 MatrixGenerics_1.6.0
## [9] matrixStats_0.61.0 ggeasy_0.1.3
## [11] ggpubr_0.4.0 ggrepel_0.9.1
## [13] WGCNA_1.70-3 fastcluster_1.2.3
## [15] dynamicTreeCut_1.63-1 forcats_0.5.1
## [17] stringr_1.4.0 dplyr_1.0.8
## [19] purrr_0.3.4 readr_2.1.2
## [21] tidyr_1.2.0 tibble_3.1.6
## [23] ggplot2_3.3.5 tidyverse_1.3.1
## [25] limma_3.50.1
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-3 ggsignif_0.6.3 ellipsis_0.3.2
## [4] htmlTable_2.4.0 XVector_0.34.0 base64enc_0.1-3
## [7] fs_1.5.2 rstudioapi_0.13 farver_2.1.0
## [10] DT_0.20 bit64_4.0.5 AnnotationDbi_1.56.2
## [13] fansi_1.0.2 lubridate_1.8.0 xml2_1.3.3
## [16] codetools_0.2-18 splines_4.1.2 doParallel_1.0.17
## [19] cachem_1.0.6 impute_1.68.0 knitr_1.37
## [22] Formula_1.2-4 jsonlite_1.7.3 broom_0.7.12
## [25] cluster_2.1.2 GO.db_3.14.0 dbplyr_2.1.1
## [28] png_0.1-7 compiler_4.1.2 httr_1.4.2
## [31] backports_1.4.1 assertthat_0.2.1 Matrix_1.5-1
## [34] fastmap_1.1.0 cli_3.2.0 htmltools_0.5.2
## [37] tools_4.1.2 gtable_0.3.0 glue_1.6.1
## [40] GenomeInfoDbData_1.2.7 Rcpp_1.0.8 carData_3.0-5
## [43] cellranger_1.1.0 jquerylib_0.1.4 vctrs_0.3.8
## [46] Biostrings_2.62.0 preprocessCore_1.56.0 crosstalk_1.2.0
## [49] iterators_1.0.14 xfun_0.29 rvest_1.0.2
## [52] lifecycle_1.0.1 rstatix_0.7.0 zlibbioc_1.40.0
## [55] scales_1.1.1 hms_1.1.1 parallel_4.1.2
## [58] RColorBrewer_1.1-2 yaml_2.3.5 memoise_2.0.1
## [61] gridExtra_2.3 sass_0.4.0 rpart_4.1-15
## [64] latticeExtra_0.6-29 stringi_1.7.6 RSQLite_2.2.10
## [67] highr_0.9 foreach_1.5.2 checkmate_2.0.0
## [70] rlang_1.0.1 pkgconfig_2.0.3 bitops_1.0-7
## [73] evaluate_0.15 lattice_0.20-45 labeling_0.4.2
## [76] htmlwidgets_1.5.4 cowplot_1.1.1 bit_4.0.4
## [79] tidyselect_1.1.2 magrittr_2.0.2 R6_2.5.1
## [82] generics_0.1.2 Hmisc_4.6-0 DelayedArray_0.20.0
## [85] DBI_1.1.2 pillar_1.7.0 haven_2.4.3
## [88] foreign_0.8-81 withr_2.4.3 abind_1.4-5
## [91] survival_3.2-13 KEGGREST_1.34.0 RCurl_1.98-1.6
## [94] nnet_7.3-16 car_3.0-12 modelr_0.1.8
## [97] crayon_1.5.0 utf8_1.2.2 tzdb_0.2.0
## [100] rmarkdown_2.11 jpeg_0.1-9 grid_4.1.2
## [103] readxl_1.3.1 data.table_1.14.2 blob_1.2.2
## [106] reprex_2.0.1 digest_0.6.29 munsell_0.5.0
## [109] bslib_0.3.1